#installation
!pip install pycountry-convert
Requirement already satisfied: pycountry-convert in c:\programdata\anaconda3\lib\site-packages (0.7.2) Requirement already satisfied: pprintpp>=0.3.0 in c:\programdata\anaconda3\lib\site-packages (from pycountry-convert) (0.4.0) Requirement already satisfied: repoze.lru>=0.7 in c:\programdata\anaconda3\lib\site-packages (from pycountry-convert) (0.7) Requirement already satisfied: pytest-mock>=1.6.3 in c:\programdata\anaconda3\lib\site-packages (from pycountry-convert) (3.10.0) Requirement already satisfied: pytest>=3.4.0 in c:\programdata\anaconda3\lib\site-packages (from pycountry-convert) (6.2.3) Requirement already satisfied: wheel>=0.30.0 in c:\programdata\anaconda3\lib\site-packages (from pycountry-convert) (0.36.2) Requirement already satisfied: pytest-cov>=2.5.1 in c:\programdata\anaconda3\lib\site-packages (from pycountry-convert) (4.0.0) Requirement already satisfied: pycountry>=16.11.27.1 in c:\programdata\anaconda3\lib\site-packages (from pycountry-convert) (22.3.5) Requirement already satisfied: setuptools in c:\programdata\anaconda3\lib\site-packages (from pycountry>=16.11.27.1->pycountry-convert) (52.0.0.post20210125) Requirement already satisfied: attrs>=19.2.0 in c:\programdata\anaconda3\lib\site-packages (from pytest>=3.4.0->pycountry-convert) (20.3.0) Requirement already satisfied: iniconfig in c:\programdata\anaconda3\lib\site-packages (from pytest>=3.4.0->pycountry-convert) (1.1.1) Requirement already satisfied: packaging in c:\programdata\anaconda3\lib\site-packages (from pytest>=3.4.0->pycountry-convert) (20.9) Requirement already satisfied: pluggy<1.0.0a1,>=0.12 in c:\programdata\anaconda3\lib\site-packages (from pytest>=3.4.0->pycountry-convert) (0.13.1) Requirement already satisfied: py>=1.8.2 in c:\programdata\anaconda3\lib\site-packages (from pytest>=3.4.0->pycountry-convert) (1.10.0) Requirement already satisfied: toml in c:\programdata\anaconda3\lib\site-packages (from pytest>=3.4.0->pycountry-convert) (0.10.2) Requirement already 
satisfied: atomicwrites>=1.0 in c:\programdata\anaconda3\lib\site-packages (from pytest>=3.4.0->pycountry-convert) (1.4.0) Requirement already satisfied: colorama in c:\programdata\anaconda3\lib\site-packages (from pytest>=3.4.0->pycountry-convert) (0.4.4) Requirement already satisfied: coverage[toml]>=5.2.1 in c:\programdata\anaconda3\lib\site-packages (from pytest-cov>=2.5.1->pycountry-convert) (7.0.5) Requirement already satisfied: tomli in c:\programdata\anaconda3\lib\site-packages (from coverage[toml]>=5.2.1->pytest-cov>=2.5.1->pycountry-convert) (2.0.1) Requirement already satisfied: pyparsing>=2.0.2 in c:\programdata\anaconda3\lib\site-packages (from packaging->pytest>=3.4.0->pycountry-convert) (2.4.7)
WARNING: Ignoring invalid distribution -portlib-metadata (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -umpy (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -mportlib-metadata (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution - (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -ensorflow-gpu (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -portlib-metadata (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -umpy (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -mportlib-metadata (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution - (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -ensorflow-gpu (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -portlib-metadata (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -umpy (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -mportlib-metadata (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution - (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -ensorflow-gpu (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -portlib-metadata (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -umpy (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -mportlib-metadata (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution - (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -ensorflow-gpu (c:\programdata\anaconda3\lib\site-packages) WARNING: You are using pip version 22.0.4; however, version 22.3.1 is 
available. You should consider upgrading via the 'C:\ProgramData\Anaconda3\python.exe -m pip install --upgrade pip' command.
import os

# Show what is in the current working directory ('.' is listdir's default).
os.listdir('.')
['.ipynb_checkpoints', 'DataSets', 'Insights.ipynb']
# IPython automagic (%pwd): echoes the kernel's current working directory.
pwd
'E:\\DataScience\\MachineLearning\\YouTube Data Insights'
# Project root and the sub-folder that holds the data files
# (Windows-style separators, kept as plain strings).
path = 'E:\\DataScience\\MachineLearning\\YouTube Data Insights'
dataset = "\\".join((path, "DataSets"))
# List the files available in the dataset folder.
os.listdir(dataset)
['List of most-subscribed YouTube channels (1).csv']
# Import the basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import plotly
import plotly.express as px
import plotly.io as pio
# Make "plotly_dark" the default figure template. The setting lives on the
# public `pio.templates` object; assigning to the private `pio._templates`
# module only creates an unused attribute and leaves the default unchanged.
pio.templates.default = "plotly_dark"
from plotly.subplots import make_subplots
from scipy import signal
from wordcloud import WordCloud ,STOPWORDS
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')
# Map visualization
import folium
from folium import Marker
from folium.plugins import MarkerCluster
# Nicer rendering of objects in the notebook output
from IPython.display import display
# Chart defaults: 16x8 figure size, then the 'fivethirtyeight' style sheet.
plt.rc('figure', figsize=(16, 8))
plt.style.use('fivethirtyeight')
# Load the most-subscribed-channels table and preview its first ten rows
# with the reversed "rocket" colour gradient.
csv_path = dataset + "\\List of most-subscribed YouTube channels (1).csv"
df = pd.read_csv(csv_path)
preview = df.head(10)
preview.style.background_gradient(cmap='rocket_r')
| Rank | Name | Brand channel | Subscribers (millions) | Primary language | Category | Country | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | T-Series | Yes | 232.000000 | Hindi[7][8] | Music | India |
| 1 | 2 | Cocomelon | Yes | 150.000000 | English | Education | United States |
| 2 | 3 | Sony Entertainment Television India | Yes | 149.000000 | Hindi[9] | Entertainment | India |
| 3 | 4 | MrBeast | No | 124.000000 | English | Entertainment | United States |
| 4 | 5 | PewDiePie | No | 111.000000 | English | Entertainment | Sweden |
| 5 | 6 | Kids Diana Show | Yes | 106.000000 | English[10][11][12] | Film | Ukraine |
| 6 | 7 | Like Nastya | No | 103.000000 | English | Entertainment | United States |
| 7 | 8 | WWE | Yes | 92.400000 | English | Professional wrestling | United States |
| 8 | 9 | Vlad and Niki | No | 91.900000 | English | Entertainment | Russia |
| 9 | 10 | Zee Music Company | Yes | 91.000000 | Hindi[13][14] | Music | India |
# Summary statistics of the table, shaded with the same colour gradient.
summary_stats = df.describe()
summary_stats.style.background_gradient(cmap='rocket_r')
| Rank | Subscribers (millions) | |
|---|---|---|
| count | 50.000000 | 50.000000 |
| mean | 25.500000 | 70.442000 |
| std | 14.577380 | 34.660653 |
| min | 1.000000 | 44.000000 |
| 25% | 13.250000 | 50.050000 |
| 50% | 25.500000 | 57.500000 |
| 75% | 37.750000 | 77.475000 |
| max | 50.000000 | 232.000000 |
# Print column names, non-null counts and dtypes of the loaded table.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 50 entries, 0 to 49 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Rank 50 non-null int64 1 Name 50 non-null object 2 Brand channel 50 non-null object 3 Subscribers (millions) 50 non-null float64 4 Primary language 50 non-null object 5 Category 50 non-null object 6 Country 50 non-null object dtypes: float64(1), int64(1), object(5) memory usage: 2.9+ KB
# Number of missing values in each column.
df.isnull().sum()
Rank 0 Name 0 Brand channel 0 Subscribers (millions) 0 Primary language 0 Category 0 Country 0 dtype: int64
# Column labels of the loaded table.
df.columns
Index(['Rank', 'Name', 'Brand channel', 'Subscribers (millions)',
'Primary language', 'Category', 'Country'],
dtype='object')
# Normalise language names: drop bracketed numeric citation markers
# (e.g. "Hindi[7][8]" -> "Hindi") with a single pattern instead of listing
# every annotated value, so markers not seen in this snapshot are cleaned too.
df['Primary language'] = (
    df['Primary language']
    .str.replace(r'\[\d+\]', '', regex=True)
    .str.strip()
)
# Count of distinct values per column (axis=0 is the default).
df.nunique()
Rank 50 Name 49 Brand channel 2 Subscribers (millions) 48 Primary language 6 Category 9 Country 16 dtype: int64
# Eleven largest subscriber counts, largest first.
subscribers = df['Subscribers (millions)']
subscribers.nlargest(n=11)
0 232.0 1 150.0 2 149.0 3 124.0 4 111.0 5 106.0 6 103.0 7 92.4 8 91.9 9 91.0 10 83.6 Name: Subscribers (millions), dtype: float64
# Cast subscriber counts to whole millions.
# NOTE: astype(int) truncates rather than rounds (92.4 -> 92, 91.9 -> 91).
df['Subscribers (millions)'] = df['Subscribers (millions)'].astype(int)
# Select the ten most-subscribed channels directly rather than filtering on a
# hard-coded "> 83" cut-off, which yields ten rows only for this snapshot.
df_10 = df.nlargest(10, 'Subscribers (millions)')
df_10.head(10).style.background_gradient(cmap='rocket_r')
| Rank | Name | Brand channel | Subscribers (millions) | Primary language | Category | Country | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | T-Series | Yes | 232 | Hindi | Music | India |
| 1 | 2 | Cocomelon | Yes | 150 | English | Education | United States |
| 2 | 3 | Sony Entertainment Television India | Yes | 149 | Hindi | Entertainment | India |
| 3 | 4 | MrBeast | No | 124 | English | Entertainment | United States |
| 4 | 5 | PewDiePie | No | 111 | English | Entertainment | Sweden |
| 5 | 6 | Kids Diana Show | Yes | 106 | English | Film | Ukraine |
| 6 | 7 | Like Nastya | No | 103 | English | Entertainment | United States |
| 7 | 8 | WWE | Yes | 92 | English | Professional wrestling | United States |
| 8 | 9 | Vlad and Niki | No | 91 | English | Entertainment | Russia |
| 9 | 10 | Zee Music Company | Yes | 91 | Hindi | Music | India |
# Bar chart of the ten most-subscribed channels. Columns are referenced by
# name so each bar's height always comes from the same row as its label.
fig = px.bar(
    df_10,
    x='Name',
    y='Subscribers (millions)',
    color='Name',
    title="Top 10 YouTube Channels Which have Highest Numbers of Subscriber",
    template='plotly_dark',
)
fig.update_xaxes(title_text='YouTube Channels')
fig.update_yaxes(title_text='Subscribers (millions)')
fig.show()
# How many of the top-10 channels are brand channels and how many are not.
brand_counts = df_10['Brand channel'].value_counts()
fig = px.bar(
    df_10,
    x=brand_counts.index,
    y=brand_counts,
    color=brand_counts.index,
    title="Top 10 YouTube Channels : count of brand channels",
    template='plotly_dark',
)
fig.update_xaxes(title_text='Brand or not')
fig.update_yaxes(title_text='Count')
fig.show()
# Donut charts for the top-10 channels: share of each category, primary
# language and country. One (column, title) pair per chart, drawn in order.
top10_pies = (
    ('Category', 'Amongs Top 10 Channels : Category %'),
    ('Primary language', 'Amongs Top 10 Channels : Language %'),
    ('Country', 'Amongs Top 10 Channels : Country %'),
)
for column, chart_title in top10_pies:
    shares = df_10[column].value_counts()
    fig = px.pie(
        df_10,
        values=shares,
        names=shares.index,
        title=chart_title,
        hole=.65,
        template='plotly_dark',
    )
    fig.show()
# Pie charts over the full table: brand presence, primary language and
# category. One (column, title, hole size) triple per chart, drawn in order.
overall_pies = (
    ('Brand channel', 'Brand Presence : %', 0.2),
    ('Primary language', 'Language : %', 0.35),
    ('Category', 'Category : Class', 0.35),
)
for column, chart_title, hole_size in overall_pies:
    shares = df[column].value_counts()
    fig = px.pie(
        df,
        values=shares,
        names=shares.index,
        title=chart_title,
        hole=hole_size,
        template='plotly_dark',
    )
    fig.show()
# Number of channels per country, over the full table.
country_counts = df['Country'].value_counts()
fig = px.bar(
    df,
    x=country_counts.index,
    y=country_counts,
    color=country_counts.index,
    title="Country With Count of Channel Registered",
    template='plotly_dark',
)
fig.update_xaxes(title_text='Country')
fig.update_yaxes(title_text='Counts')
fig.show()
# Preparing data for the map plots: one row per country with its channel
# count, on a fresh 0..n-1 index.
df_18 = (
    df['Country']
    .value_counts()
    .rename_axis('Country')
    .reset_index(name='Total')
)
df_18
| Country | Total | |
|---|---|---|
| 0 | India | 17 |
| 1 | United States | 12 |
| 2 | Brazil | 4 |
| 3 | South Korea | 4 |
| 4 | Cyprus[a] | 2 |
| 5 | El Salvador | 1 |
| 6 | Argentina | 1 |
| 7 | Russia | 1 |
| 8 | United States ( Puerto Rico) | 1 |
| 9 | Mexico | 1 |
| 10 | Romania | 1 |
| 11 | Ukraine | 1 |
| 12 | Canada | 1 |
| 13 | United Kingdom | 1 |
| 14 | Chile | 1 |
| 15 | Sweden | 1 |
# Print the dtypes and non-null counts of the per-country table.
df_18.info()
# The Country column is still generic `object` dtype and needs converting.
<class 'pandas.core.frame.DataFrame'> RangeIndex: 16 entries, 0 to 15 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Country 16 non-null object 1 Total 16 non-null int64 dtypes: int64(1), object(1) memory usage: 384.0+ bytes
# Convert the object-typed Country column to the dedicated pandas string
# dtype, then re-check the schema.
df_18['Country'] = df_18['Country'].astype(pd.StringDtype())
df_18.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 16 entries, 0 to 15 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Country 16 non-null string 1 Total 16 non-null int64 dtypes: int64(1), string(1) memory usage: 384.0 bytes
# Functions to convert country names to alpha-3 / alpha-2 country codes and continent codes
from pycountry_convert import country_name_to_country_alpha3
from pycountry_convert import country_name_to_country_alpha2
from pycountry_convert import country_alpha2_to_continent_code
# Inspect the raw country names before they are cleaned.
df_18['Country']
0 India 1 United States 2 Brazil 3 South Korea 4 Cyprus[a] 5 El Salvador 6 Argentina 7 Russia 8 United States ( Puerto Rico) 9 Mexico 10 Romania 11 Ukraine 12 Canada 13 United Kingdom 14 Chile 15 Sweden Name: Country, dtype: string
# Clean the country names so they can be looked up: turn non-breaking spaces
# into normal spaces (deleting them would glue words together), drop bracketed
# footnote markers such as "[a]", and collapse repeated whitespace.
# NOTE(review): composite labels such as "United States ( Puerto Rico)" are
# kept as-is and will presumably still fail the lookup - decide how to map them.
lst = (
    df_18['Country']
    .str.replace('\xa0', ' ', regex=False)
    .str.replace(r'\[[^\]]*\]', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
    .tolist()
)
lst
['India', 'United States', 'Brazil', 'South Korea', 'Cyprus[a]', 'El Salvador', 'Argentina', 'Russia', 'United States(Puerto Rico)', 'Mexico', 'Romania', 'Ukraine', 'Canada', 'United Kingdom', 'Chile', 'Sweden']
def _lookup_codes(country):
    """Return ``(alpha3, continent_code)`` for the country name *country*.

    Each element falls back to ``'Unknown'`` when pycountry_convert rejects
    the name (or the derived alpha-2 code) with a ``KeyError``. Only that
    error is caught, so unrelated failures are no longer silently hidden.
    """
    try:
        alpha3 = country_name_to_country_alpha3(country)
    except KeyError:
        alpha3 = 'Unknown'
    try:
        continent = country_alpha2_to_continent_code(
            country_name_to_country_alpha2(country)
        )
    except KeyError:
        continent = 'Unknown'
    return alpha3, continent


# Resolve every cleaned country name once, then split the pairs into columns.
codes = [_lookup_codes(country) for country in lst]
cd = pd.Series([alpha3 for alpha3, _ in codes], dtype=object)
cn = pd.Series([continent for _, continent in codes], dtype=object)
# Assign positionally so the values follow the row order of `lst`.
df_18['CountryCode'] = cd.to_numpy()
df_18['ContinentCode'] = cn.to_numpy()
df_18
| Country | Total | CountryCode | ContinentCode | |
|---|---|---|---|---|
| 0 | India | 17 | IND | AS |
| 1 | United States | 12 | USA | NA |
| 2 | Brazil | 4 | BRA | SA |
| 3 | South Korea | 4 | KOR | AS |
| 4 | Cyprus[a] | 2 | Unknown | Unknown |
| 5 | El Salvador | 1 | SLV | NA |
| 6 | Argentina | 1 | ARG | SA |
| 7 | Russia | 1 | RUS | EU |
| 8 | United States ( Puerto Rico) | 1 | Unknown | Unknown |
| 9 | Mexico | 1 | MEX | NA |
| 10 | Romania | 1 | ROU | EU |
| 11 | Ukraine | 1 | UKR | EU |
| 12 | Canada | 1 | CAN | NA |
| 13 | United Kingdom | 1 | GBR | EU |
| 14 | Chile | 1 | CHL | SA |
| 15 | Sweden | 1 | SWE | EU |
# Bubble map: one marker per country code, sized and coloured by the
# number of channels in `Total`.
bubble_options = dict(
    locations="CountryCode",
    color="Total",
    hover_name="Country",
    size="Total",
    title='Distribution of Registered YouTube Channels using Bubble Maps',
    projection="natural earth",
    color_continuous_scale=px.colors.sequential.Rainbow,
    template='plotly_dark',
)
fig = px.scatter_geo(df_18, **bubble_options)
fig.show()
# Choropleth of channel counts per country on an orthographic projection.
choropleth_options = dict(
    locations="CountryCode",
    color="Total",
    hover_name="Country",
    color_continuous_scale=px.colors.sequential.Rainbow,
    projection='orthographic',
    title='Distribution of Registerd Youtube Channels in World Map',
    height=600,
    width=1000,
    template='plotly_dark',
)
fig = px.choropleth(df_18, **choropleth_options)
# Show country borders plus a latitude/longitude grid drawn in '#222222'.
fig.update_geos(
    showcountries=True,
    lataxis=dict(showgrid=True, gridcolor='#222222'),
    lonaxis=dict(showgrid=True, gridcolor='#222222'),
)
fig.show()
# Number of countries (rows of df_18) per continent code - not channel totals.
df_18['ContinentCode'].value_counts()
EU 5 NA 4 SA 3 AS 2 Unknown 2 Name: ContinentCode, dtype: int64
# Channels per continent: add up each country's channel count (`Total`).
# Counting rows of df_18 would give the number of countries per continent,
# which is not what the chart title promises.
continent_totals = (
    df_18.groupby('ContinentCode', as_index=False)['Total']
    .sum()
    .sort_values('Total', ascending=False)
)
fig = px.bar(
    continent_totals,
    x='ContinentCode',
    y='Total',
    color='ContinentCode',
    title="continent Wise YouTube Channels",
    template='plotly_dark',
)
fig.update_xaxes(title_text='continent')
fig.update_yaxes(title_text='Count')
fig.show()
## Correlations
import phik
from phik.report import plot_correlation_matrix
from phik import report
# Phi-K correlation matrix. The numeric (interval) columns are named
# explicitly instead of letting phik guess them from the dtypes.
# NOTE(review): `Name` is almost unique per row, so its coefficients of 1.0
# are probably not informative - consider dropping it before this call.
phik_overview = df.phik_matrix(interval_cols=['Rank', 'Subscribers (millions)'])
interval columns not set, guessing: ['Rank', 'Subscribers (millions)']
# Shade the correlation matrix with the reversed "rocket" colour gradient.
phik_styler = phik_overview.style
phik_styler.background_gradient(cmap='rocket_r')
| Rank | Name | Brand channel | Subscribers (millions) | Primary language | Category | Country | |
|---|---|---|---|---|---|---|---|
| Rank | 1.000000 | 0.934369 | 0.102468 | 0.764759 | 0.667582 | 0.080475 | 0.574447 |
| Name | 0.934369 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| Brand channel | 0.102468 | 1.000000 | 1.000000 | 0.122210 | 0.703057 | 0.290028 | 0.851007 |
| Subscribers (millions) | 0.764759 | 1.000000 | 0.122210 | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
| Primary language | 0.667582 | 1.000000 | 0.703057 | 0.000000 | 1.000000 | 0.000000 | 0.934360 |
| Category | 0.080475 | 1.000000 | 0.290028 | 0.000000 | 0.000000 | 1.000000 | 0.302373 |
| Country | 0.574447 | 1.000000 | 0.851007 | 0.000000 | 0.934360 | 0.302373 | 1.000000 |
![]()
# Author ~ Saurabh Kumar..........xxxxxxxxxxxxxxxxxxxxxxxxx